SC_Prostate_Run_1 analysis.

Analyiis of scRNA-seq data provided by Eduardo Andrés León (eduardo.andres@csic.es, Instituto de Parasitología y Biología López Neyra, Granada). SC data was generated and pre-analysed sing BD Rhapsody systems and Illumina sequencers. BD includes pipelines for read mapping and putative cell filtering. An analysis using the Seurat package (https://satijalab.org/seurat/) was done by the CSIC staff using the seven bridges platform, but an obsolete normalization method was used. This R markdown file describes a preliminary analysis for the first of the four runs the project contains.

library(Seurat)
library(SeuratData)
library(dplyr)
library(patchwork)
library(sctransform)
library(ggplot2)

1. Import data and exploratory analysis.

Import data from RDS file provided by Eduardo Andrés León (eduardo.andres@csic.es, Instituto de Parasitología y Biología López Neyra, Granada).

1.1. Raw counts.

run1.raw <- readRDS("../Data/run1/C1_Seurat.rds")
run1.raw
## An object of class Seurat 
## 34627 features across 3809 samples within 1 assay 
## Active assay: RNA (34627 features, 0 variable features)
##  2 layers present: counts, data
##  1 dimensional reduction calculated: tsne
run1.raw@meta.data[1:5, ]
##         orig.ident nCount_RNA nFeature_RNA Cell_Type_Experimental
## 450  SeuratProject       5177         1952           T_CD4_memory
## 776  SeuratProject      15741         3215           T_CD8_memory
## 828  SeuratProject       4063          343         Natural_killer
## 1237 SeuratProject      16465         3414                      B
## 1546 SeuratProject       8684         3104           T_CD8_memory
##          Sample_Tag    Sample_Name
## 450       Multiplet      Multiplet
## 776  SampleTag05_hs SampleTag05_hs
## 828  SampleTag06_hs SampleTag06_hs
## 1237      Multiplet      Multiplet
## 1546 SampleTag05_hs SampleTag05_hs
run1.raw[["RNA"]]$counts[1:5, 1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
##          450 776 828 1237 1546
## A1BG       .   .   .    .    .
## A1BG-AS1   .   .   .    1    .
## A1CF       .   .   .    .    .
## A2M        .   .   .    .    .
## A2M-AS1    .   .   .    .    .
head(colnames(run1.raw[["RNA"]]$counts), 10)
##  [1] "450"  "776"  "828"  "1237" "1546" "3094" "3103" "3496" "5061" "6190"
# Cells are identified by an integer, not their UMI.

1.2. Pre-processed counts.

run1 <- readRDS("../Data/run1/C1_Seurat_Edu.rds")
run1
## An object of class Seurat 
## 34627 features across 2151 samples within 1 assay 
## Active assay: RNA (34627 features, 2000 variable features)
##  3 layers present: counts, data, scale.data
##  3 dimensional reductions calculated: tsne, pca, umap
run1@meta.data[1:5, ]
##         orig.ident nCount_RNA nFeature_RNA Cell_Type_Experimental
## 776  SeuratProject      15741         3215           T_CD8_memory
## 1546 SeuratProject       8684         3104           T_CD8_memory
## 3103 SeuratProject       7442         2341            T_CD8_naive
## 3496 SeuratProject      14109         3444           T_CD8_memory
## 6190 SeuratProject       4366         1618         Natural_killer
##          Sample_Tag   Sample_Name percent.mt RNA_snn_res.0.5 seurat_clusters
## 776  SampleTag05_hs CaPBi-23-21-S  10.488533               1               1
## 1546 SampleTag05_hs CaPBi-23-21-S   5.354675               0               0
## 3103 SampleTag05_hs CaPBi-23-21-S  10.279495               1               1
## 3496 SampleTag03_hs CaPBi-23-20-S  12.786165               1               1
## 6190 SampleTag06_hs CaPBi-23-21-T   1.351351               4               4
##                     PreciseCell      GeneralCell Type
## 776                     NK_cell          T_cells    N
## 1546                T_cell:CD8+          T_cells    N
## 3103 T_cell:CD8+_Central_memory          T_cells    N
## 3496 T_cell:CD8+_Central_memory          T_cells   AT
## 6190 Epithelial_cells:bronchial Epithelial_cells    N
run1[["RNA"]]$counts[1:5, 1:5]
## 5 x 5 sparse Matrix of class "dgCMatrix"
##          776 1546 3103 3496 6190
## A1BG       .    .    .    .    .
## A1BG-AS1   .    .    .    .    .
## A1CF       .    .    .    .    .
## A2M        .    .    .    .    .
## A2M-AS1    .    .    .    .    .
head(colnames(run1[["RNA"]]$counts), 10)
##  [1] "776"   "1546"  "3103"  "3496"  "6190"  "6544"  "10044" "10067" "10757"
## [10] "11599"

2. QC.

2.1. Mitochondrial gene percentage calculation.

run1.raw[["percent.mt"]] <- PercentageFeatureSet(run1.raw, pattern = "^MT-")

run1.raw[["percent.mt"]][1:5, ]
## [1]  6.857253 10.177244 76.913611  6.401458  5.043759

2.2. QC visualization.

2.3. Filtering cells.

A. Filtering 1 % top and bottom percentiles.

minCov = 1000  #if a sample has a good coverage (>=minCov), then don't set a lower thresold for nCount, it's already pretty good. 
if (min(run1.raw$nCount_RNA) >= minCov) {
    countLOW = min(run1.raw$nCount_RNA)
} else {
    countLOW = quantile(run1.raw$nCount_RNA, prob = c(0.01))
}
countHIGH = quantile(run1.raw$nCount_RNA, prob = 0.99)
featureHIGH = quantile(run1.raw$nFeature_RNA, prob = 0.99)
featureLOW = quantile(run1.raw$nFeature_RNA, prob = 0.01)

# subset
run1.subset <- subset(run1.raw, subset = nFeature_RNA > featureLOW & nFeature_RNA < featureHIGH &
    nCount_RNA > countLOW & nCount_RNA < countHIGH & percent.mt < 25)

run1.subset
## An object of class Seurat 
## 34627 features across 2613 samples within 1 assay 
## Active assay: RNA (34627 features, 0 variable features)
##  2 layers present: counts, data
##  1 dimensional reduction calculated: tsne

B. Filtering mean +- 2*SD.

# # Get filtering parameters count.max <- round(mean(run1.raw$nCount_RNA) + 2 *
# sd(run1.raw$nCount_RNA), digits = -2) count.min <- round(mean(run1.raw$nCount_RNA) - 2 *
# sd(run1.raw$nCount_RNA), digits = -2) feat.max <- round(mean(run1.raw$nFeature_RNA) + 2 *
# sd(run1.raw$nFeature_RNA), digits = -2) feat.min <- round(mean(run1.raw$nFeature_RNA) - 2 *
# sd(run1.raw$nFeature_RNA), digits = -2) # Set minimum parameters to 0 if negative value if
# (count.min < 0){ count.min <- 0 } else { count.min <- count.min } if (feat.min < 0){
# feat.min <- 0 } else { feat.min <- feat.min } ## Filter cells run1.subset <-
# subset(run1.raw, subset = nFeature_RNA > feat.min & nFeature_RNA < feat.max & nCount_RNA <
# count.max & nCount_RNA > count.min & percent.mt < 25) run1.subset

2.4. Normalization using SCTransform.

# Filtered using 2.3.A.: 1% top and bottom percentiles.

run1.subset <- SCTransform(run1.subset, vst.flavor = "v2", vars.to.regress = "percent.mt")

2.5. Visualizing QC parameters after normalization.

# Visualize QC metrics as a violin plot
VlnPlot(run1.subset, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)

# Zoom in on nCount_RNA violin plot.
VlnPlot(run1.subset, features = "nCount_RNA", ncol = 1) + ylim(0, 25000) + NoLegend()

# Visualize relationships in metadata to detect outliers with FeatureScatter function
plot1 <- FeatureScatter(run1.subset, feature1 = "nCount_RNA", feature2 = "percent.mt")
plot2 <- FeatureScatter(run1.subset, feature1 = "nCount_RNA", feature2 = "nFeature_RNA")
plot1 + plot2

3. Identification of highly variable features (feature selection).

run1.subset <- FindVariableFeatures(run1.subset, selection.method = "vst", nfeatures = 2000)

# Genes with differencial expression in different cells are good candidates to be biomarkers.
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(run1.subset), 10)

# plot variable features with and without labels
plot1 <- VariableFeaturePlot(run1.subset)
plot2 <- LabelPoints(plot = plot1, points = top10, repel = TRUE)
plot1 + plot2

4. Linear dimensional reduction (PCA).

5. Clustering

run1.subset <- FindNeighbors(run1.subset, dims = 1:20)
run1.subset <- FindClusters(run1.subset)  # Default resolution = 0.8
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2613
## Number of edges: 80705
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8964
## Number of communities: 19
## Elapsed time: 0 seconds

saveRDS(run1, file = "../Ouput/Run1_raw.rds")

Data analysis using pre-processed data.

## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 2151
## Number of edges: 64853
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8949
## Number of communities: 19
## Elapsed time: 0 seconds